!pip install ydata_profiling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pandas_profiling import ProfileReport
Requirement already satisfied: ydata_profiling in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (4.0.0) Requirement already satisfied: visions[type_image_path]==0.7.5 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.7.5) Requirement already satisfied: seaborn<0.13,>=0.10.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.11.2) Requirement already satisfied: typeguard<2.14,>=2.13.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (2.13.3) Requirement already satisfied: PyYAML<6.1,>=5.0.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (6.0) Requirement already satisfied: matplotlib<3.7,>=3.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (3.5.2) Requirement already satisfied: htmlmin==0.1.12 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.1.12) Requirement already satisfied: scipy<1.10,>=1.4.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.9.1) Requirement already satisfied: statsmodels<0.14,>=0.13.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.13.2) Requirement already satisfied: multimethod<1.10,>=1.4 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.9.1) Requirement already satisfied: jinja2<3.2,>=2.11.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (2.11.3) Requirement already satisfied: numpy<1.24,>=1.16.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.21.5) Requirement already satisfied: pydantic<1.11,>=1.8.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.10.5) Requirement already satisfied: 
tqdm<4.65,>=4.48.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (4.64.1) Requirement already satisfied: pandas!=1.4.0,<1.6,>1.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.4.4) Requirement already satisfied: phik<0.13,>=0.11.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.12.3) Requirement already satisfied: requests<2.29,>=2.24.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (2.28.1) Requirement already satisfied: networkx>=2.4 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (2.8.4) Requirement already satisfied: attrs>=19.3.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (21.4.0) Requirement already satisfied: tangled-up-in-unicode>=0.0.4 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (0.2.0) Requirement already satisfied: imagehash in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (4.3.1) Requirement already satisfied: Pillow in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (9.2.0) Requirement already satisfied: MarkupSafe>=0.23 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from jinja2<3.2,>=2.11.1->ydata_profiling) (2.0.1) Requirement already satisfied: kiwisolver>=1.0.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (1.4.2) Requirement already satisfied: fonttools>=4.22.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (4.25.0) Requirement 
already satisfied: packaging>=20.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (21.3) Requirement already satisfied: python-dateutil>=2.7 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (2.8.2) Requirement already satisfied: pyparsing>=2.2.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (3.0.9) Requirement already satisfied: cycler>=0.10 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (0.11.0) Requirement already satisfied: pytz>=2020.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from pandas!=1.4.0,<1.6,>1.1->ydata_profiling) (2022.1) Requirement already satisfied: joblib>=0.14.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from phik<0.13,>=0.11.1->ydata_profiling) (1.1.0) Requirement already satisfied: typing-extensions>=4.2.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from pydantic<1.11,>=1.8.1->ydata_profiling) (4.3.0) Requirement already satisfied: idna<4,>=2.5 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (3.3) Requirement already satisfied: certifi>=2017.4.17 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (2022.9.24) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (1.26.11) Requirement already satisfied: charset-normalizer<3,>=2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (2.0.4) Requirement already satisfied: patsy>=0.5.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages 
(from statsmodels<0.14,>=0.13.2->ydata_profiling) (0.5.2) Requirement already satisfied: six in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from patsy>=0.5.2->statsmodels<0.14,>=0.13.2->ydata_profiling) (1.16.0) Requirement already satisfied: PyWavelets in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from imagehash->visions[type_image_path]==0.7.5->ydata_profiling) (1.3.0)
# Dataset 1 (adult.data).
# header=None: the first line is read as data and the columns get integer labels;
# real column names are assigned after both files are combined.
df1 = pd.read_csv(
    'adult_data.csv',
    header=None,
)
df1.shape
(32561, 15)
# Dataset 2 (adult.test).
# Read the same way as dataset 1: no header row, integer column labels.
df2 = pd.read_csv(
    'adult_test.csv',
    header=None,
)
df2.shape
(16281, 15)
# Stack the two datasets row-wise into a single frame.
# ignore_index=True discards the per-file row labels and renumbers the rows 0..n-1.
df = pd.concat((df1, df2), ignore_index=True)
# Row/column count of the combined (still unmodified) dataset.
df.shape
(48842, 15)
# Human-readable labels for the 15 columns, in file order.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'country',
    'target income',
]
# Replace the integer labels produced by header=None with the names above.
df.columns = column_names
# Preview the first five rows.
df.head()
| | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | gender | capital-gain | capital-loss | hours-per-week | country | target income |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
# Profile the combined dataset before any cleaning; this report is later
# used as the baseline in the original-vs-transformed comparison.
EDA = ProfileReport(
    df,
    title="EDA of the Adult Dataset",
    # NOTE(review): presumably makes the rendered HTML span the full page width - confirm.
    html={'style': {'full_width': True}},
)
# Bare expression so the notebook renders the report as the cell output.
EDA
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Summary statistics of the numeric columns, transposed so that each
# column of the dataset becomes one row of the summary table.
df.describe().transpose()
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| age | 48842.0 | 38.643585 | 13.710510 | 17.0 | 28.0 | 37.0 | 48.0 | 90.0 |
| fnlwgt | 48842.0 | 189664.134597 | 105604.025423 | 12285.0 | 117550.5 | 178144.5 | 237642.0 | 1490400.0 |
| education-num | 48842.0 | 10.078089 | 2.570973 | 1.0 | 9.0 | 10.0 | 12.0 | 16.0 |
| capital-gain | 48842.0 | 1079.067626 | 7452.019058 | 0.0 | 0.0 | 0.0 | 0.0 | 99999.0 |
| capital-loss | 48842.0 | 87.502314 | 403.004552 | 0.0 | 0.0 | 0.0 | 0.0 | 4356.0 |
| hours-per-week | 48842.0 | 40.422382 | 12.391444 | 1.0 | 40.0 | 40.0 | 45.0 | 99.0 |
# Optional inspection of the raw category frequencies (uncomment to run):
# df.workclass.value_counts()
# df.occupation.value_counts()
# df.country.value_counts()

# All cleaning below is done on an independent copy, so `df` keeps the
# original values for the before/after comparison.
df_modified = df.copy(deep=True)
df_modified.shape
(48842, 15)
# Missing values in workclass, occupation and country are recorded as '?'.
# To list the affected rows of a column, run e.g.:
#   df[df.occupation.str.contains(pat='?', case=False, regex=False)]
#   df[df.workclass.str.contains(pat='?', case=False, regex=False)]
#   df[df.country.str.contains(pat='?', case=False, regex=False)]

# Remove every row containing '?' in any of the three columns, filtering the
# working copy one column at a time.
for column in ('occupation', 'workclass', 'country'):
    # regex=False: '?' is matched as a literal character, not as a regex quantifier.
    has_placeholder = df_modified[column].str.contains(pat='?', case=False, regex=False)
    # Keep only the rows where the match result is exactly False.
    df_modified = df_modified[has_placeholder.eq(False)]

# Shape of the dataset after removing the '?' rows.
df_modified.shape
(45222, 15)
# Sanity check: '?' should no longer show up among the workclass categories.
df_modified['workclass'].value_counts()
# The same check can be repeated for the other two cleaned columns:
# df_modified.occupation.value_counts()
# df_modified.country.value_counts()
Private 33307 Self-emp-not-inc 3796 Local-gov 3100 State-gov 1946 Self-emp-inc 1646 Federal-gov 1406 Without-pay 21 Name: workclass, dtype: int64
# Frequency of each education level before the levels are grouped.
print(df_modified['education'].value_counts())
HS-grad 14783 Some-college 9899 Bachelors 7570 Masters 2514 Assoc-voc 1959 11th 1619 Assoc-acdm 1507 10th 1223 7th-8th 823 Prof-school 785 9th 676 12th 577 Doctorate 544 5th-6th 449 1st-4th 222 Preschool 72 Name: education, dtype: int64
# Collapse the 16 raw education levels into 6 broader groups.
# Each key is the new group label; its list holds the raw levels folded into it.
# 'Bachelors', 'Masters' and 'Doctorate' already carry their final label and
# therefore need no replacement.
education_groups = {
    'Dropout': ['Preschool', '1st-4th', '5th-6th', '7th-8th',
                '9th', '10th', '11th', '12th'],
    'High Grad': ['HS-grad'],
    'Masters': ['Prof-school'],
    'Community College': ['Some-college', 'Assoc-acdm', 'Assoc-voc'],
}
for group_label, raw_levels in education_groups.items():
    for raw_level in raw_levels:
        # Literal substring replacement (regex=False): the raw labels are plain
        # text, and replacing only the label part keeps the rest of each value
        # (e.g. the leading space carried over from the file) untouched.
        df_modified['education'] = df_modified['education'].str.replace(
            raw_level, group_label, regex=False
        )

# Show the distinct labels that remain after grouping.
df_modified['education'].unique()
array([' Bachelors', ' High Grad', ' Dropout', ' Masters',
' Community College', ' Doctorate'], dtype=object)
# Frequency of each education group after the levels were merged.
print(df_modified['education'].value_counts())
High Grad 14783 Community College 13365 Bachelors 7570 Dropout 5661 Masters 3299 Doctorate 544 Name: education, dtype: int64
# Frequency of each marital status before the categories are grouped.
print(df_modified.loc[:, 'marital-status'].value_counts())
Married-civ-spouse 21055 Never-married 14598 Divorced 6297 Separated 1411 Widowed 1277 Married-spouse-absent 552 Married-AF-spouse 32 Name: marital-status, dtype: int64
# Collapse the 7 raw marital-status categories into 4 groups.
# Each key is the new group label; its list holds the raw categories folded into it.
# 'Separated' and 'Widowed' already carry their final label and need no replacement.
marital_groups = {
    'Unmarried': ['Never-married'],
    'Married': ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
    'Separated': ['Divorced'],
}
for group_label, raw_statuses in marital_groups.items():
    for raw_status in raw_statuses:
        # Literal, case-sensitive substring replacement: the lowercase 'married'
        # inside 'Unmarried' is never hit by the capitalised 'Married-...' labels.
        df_modified['marital-status'] = df_modified['marital-status'].str.replace(
            raw_status, group_label, regex=False
        )

# Show the distinct labels that remain after grouping.
df_modified['marital-status'].unique()
array([' Unmarried', ' Married', ' Separated', ' Widowed'], dtype=object)
# Frequency of each marital-status group after the categories were merged.
print(df_modified.loc[:, 'marital-status'].value_counts())
Married 21639 Unmarried 14598 Separated 7708 Widowed 1277 Name: marital-status, dtype: int64
# Frequency of each raw income label; used to spot inconsistent spellings.
print(df_modified.loc[:, 'target income'].value_counts())
<=50K 22654 <=50K. 11360 >50K 7508 >50K. 3700 Name: target income, dtype: int64
# Some income labels carry a trailing period ('<=50K.', '>50K.'); rewrite them
# to the period-free spelling so that only two classes remain.
for dotted_label, clean_label in (('<=50K.', '<=50K'), ('>50K.', '>50K')):
    # regex=False: the '.' is a literal period, not the regex "any character".
    df_modified['target income'] = df_modified['target income'].str.replace(
        dotted_label, clean_label, regex=False
    )

# Show the distinct labels that remain.
df_modified['target income'].unique()
array([' <=50K', ' >50K'], dtype=object)
# Frequency of the two income classes after the labels were unified.
print(df_modified.loc[:, 'target income'].value_counts())
<=50K 34014 >50K 11208 Name: target income, dtype: int64
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each (the grouping above may have made previously distinct
# rows identical).
df_modified = df_modified.drop_duplicates(keep='first')
# Shape of the dataset after dropping duplicates.
df_modified.shape
(45175, 15)
# Profile the cleaned dataset.
EDA_modified = ProfileReport(
    df_modified,
    title="EDA of modified dataset",
)
# Combine the baseline profile (raw data) with the profile of the cleaned data
# into one comparison report.
comparison_report = EDA.compare(EDA_modified)
# Write the comparison to an HTML file in the working directory.
comparison_report.to_file("original_vs_transformed.html")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
# Bare expression: as the last statement of a notebook cell it renders the
# original-vs-transformed comparison report as the cell output.
comparison_report